# Load packages here
library("tidyverse")
## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'hms'
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5 ✔ purrr 0.3.4
## ✔ tibble 3.1.5 ✔ dplyr 1.0.6
## ✔ tidyr 1.1.3 ✔ stringr 1.4.0
## ✔ readr 1.4.0 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
In this exercise, the goal is to create one of the most famous plots in chaos theory. The equation of the logistic map is very simple, but its behaviour is stunningly complex:
\[ x_{n+1} = rx_{n}(1-x_{n}) \]
Starting with an initial value of \(x_{0}\) between zero and one, e.g. 0.5, and setting a constant value of r e.g. between zero and four, the equation is iterated forward and thereby computes \(x_{1}, x_{2}\), etc. We will only care about visualisation here, but if you are interested in learning more about the background of the equation and plot, e.g. have a look at this or this video.
The goal is to create a plot with different values of r on the x-axis and then x values on the y-axis corresponding to each r value. In parts of the plot, all these x values will be on a single point, but for other r values x moves perpetually.
The following code chunk computes the main dataset of the plot for
you. You are welcome to study the code, but this is not part of the
assignment and you do not have to worry about how exactly it works (this
is not a course about chaos theory after all). Data contained in
logistic_map_data is already in a tidy format, one variable
denotes the value of r, one variable the value of the associated x’s.
For each value of r repeated over \(n=1000\) rows, there are \(n\) associated rows of x values (these can
be constant or fluctuating, depending on the value of r). Only some
information for the colour still has to be added.
# Compute the bifurcation data for the logistic map x_{n+1} = r*x_n*(1-x_n).
# For each r in r_range we iterate the map from x_0 = 0.5, discard the first
# `to_discard` transient iterates, and store the next n values of x.

# x observations for each r value
n <- 1000
# Step between each r value
r_step <- 0.001
r_range <- seq(2.5, 4, by = r_step)
# Burn-in iterations discarded before the n stored observations, so that the
# stored values lie on the attractor rather than on the initial transient.
to_discard <- 500

# Preallocate the full result matrix: one (r, x) row per stored observation.
# (ncol is now named; the original passed 2 positionally.)
logistic_map_data <- matrix(0, nrow = n * length(r_range), ncol = 2)

# Iterate the logistic map for a single r and return the last n values.
# Parameterising x0 generalises the original hard-coded starting value.
iterate_logistic <- function(r, n, to_discard, x0 = 0.5) {
  series <- numeric(n + to_discard)
  series[1] <- x0
  for (k in seq_len(n + to_discard - 1)) {
    series[k + 1] <- r * series[k] * (1 - series[k])
  }
  tail(series, n)
}

# Loop over indices rather than values: the original re-derived the index via
# match(r, r_range) on every iteration, a float-sensitive (and O(n)) lookup.
for (i in seq_along(r_range)) {
  rows <- (1 + n * (i - 1)):(n * i)
  logistic_map_data[rows, 1] <- r_range[i]
  logistic_map_data[rows, 2] <- iterate_logistic(r_range[i], n, to_discard)
}

logistic_map_data <- as_tibble(data.frame(logistic_map_data))
colnames(logistic_map_data) <- c("r", "x")
Hint: Create your final dataset with n <- 1000 and
r_step <- 0.001, however, for these values it takes R
some time to compute the plot. When building your plot, adjusting axes,
colours, etc., one approach is to first use e.g. n <- 10
and r_step <- 0.01 until you have a version of the plot
that you are happy with. Just note that the opacity parameter will have
to be decreased again once you have increased n because now
there are more points in the plot.
# Your code here
#Sources:
#https://stackoverflow.com/questions/29966582/ggplot-geom-point-with-colors-based-on-specific-discrete-values
#https://felixfan.github.io/ggplot2-remove-grid-background-margin/
#https://imagecolorpicker.com
# Bifurcation diagram: r on the x-axis, attractor values x on the y-axis.
# Points are binned by r so each region of the diagram gets its own colour.
band_colours <- c("(-Inf,3.5]" = "#ff7473",
                  "(3.5,3.6]"  = "#b1ac11",
                  "(3.6,3.7]"  = "#21c332",
                  "(3.7,3.8]"  = "#78d8da",
                  "(3.8,3.9]"  = "#7495ff",
                  "(3.9,4]"    = "#f666e0")
ggplot(logistic_map_data, aes(x = r, y = x)) +
  # Tiny, nearly transparent points: the plot has ~1.5 million of them.
  geom_point(aes(colour = cut(r, c(-Inf, 3.5, 3.6, 3.7, 3.8, 3.9, 4))),
             size = 0.000001, alpha = 0.01) +
  scale_colour_manual(values = band_colours) +
  # Strip the legend, grid, background, and the (meaningless) y-axis labels.
  theme(legend.position = "none",
        panel.grid.minor = element_blank(),
        panel.grid.major = element_blank(),
        panel.background = element_blank(),
        axis.title.y = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank())
In this exercise, try to replicate the following figure that displays
the average popularity metrics of legislators grouped by gender and
party. Note that this example first involves some reshaping of the data
which you can do with dplyr from the
tidyverse.
# Data for the plot: Facebook posts by members of Congress.
# NOTE(review): stringsAsFactors = FALSE is the default since R 4.0, but
# keeping it explicit documents the intent for older R versions.
fb <- read.csv("data/fb-congress-data.csv", stringsAsFactors=FALSE)
# Your code here
#Sources:
#https://stackoverflow.com/questions/36003699/how-can-i-create-a-new-column-based-on-conditional-statements-and-dplyr
#https://www.datanovia.com/en/lessons/combine-multiple-ggplots-into-a-figure/#combine-multiple-ggplots-using-ggarrange
#https://rdrr.io/cran/ggpubr/man/annotate_figure.html
library(ggpubr)
# Average each popularity metric by gender and party, then add a combined
# "party-gender" category label used for the fill colours in the panels below.
metric_cols <- c("likes_count", "comments_count", "shares_count",
                 "love_count", "haha_count", "wow_count",
                 "angry_count", "sad_count")
fb_mod <- fb %>%
  # BUG FIX: the original `party == c("Democrat", "Republican")` recycled the
  # two-element vector against the column, silently keeping only alternating
  # rows. `%in%` keeps every Democrat and Republican row as intended.
  filter(party %in% c("Democrat", "Republican")) %>%
  group_by(gender, party) %>%
  # across() replaces eight hand-written mean() calls; column names are kept.
  summarise(across(all_of(metric_cols), ~ mean(.x, na.rm = TRUE)),
            .groups = "drop") %>%
  mutate(category = case_when(
    gender == "F" & party == "Democrat" ~ "D-F",
    gender == "F" & party == "Republican" ~ "R-F",
    gender == "M" & party == "Democrat" ~ "D-M",
    gender == "M" & party == "Republican" ~ "R-M"
  ))
## `summarise()` has grouped output by 'gender'. You can override using the
## `.groups` argument.
# One bar chart per popularity metric, coloured by party/gender category.
# The eight near-identical ggplot blocks are collapsed into one helper.
category_colours <- c("D-F" = "#01178b", "R-F" = "#8b0f00",
                      "D-M" = "#0432ff", "R-M" = "#ff2600")

# Build the bar chart for one metric column, given by name.
plot_metric <- function(metric) {
  ggplot(fb_mod, aes(fill = category, y = .data[[metric]], x = category)) +
    geom_bar(position = "dodge", stat = "identity") +
    scale_fill_manual(values = category_colours) +
    # Keep the per-panel y-axis label identical to the original column name.
    ylab(metric) +
    theme(legend.position = "none", axis.title.x = element_blank())
}

metrics <- c("likes_count", "comments_count", "shares_count", "love_count",
             "haha_count", "wow_count", "angry_count", "sad_count")
panels <- lapply(metrics, plot_metric)
plot <- ggarrange(plotlist = panels, ncol = 4, nrow = 2)
annotate_figure(plot,
                top = text_grob("Partisan asymmetries by gender in Facebook popularity metrics"),
                left = text_grob("Average of each type of social metric", rot = 90),
                bottom = text_grob("Party and gender of Member of Congress"))
For this exercise, try to replicate the plot below, which Pablo Barbera prepared for a Washington Post blog post a few years ago.
The plot combines two sources of data: The ideology estimates for
each actor (available in ideology_1.csv) and a random
sample of ideology estimates for the three density plots (in
ideology_2.csv).
As a clue, Pablo used theme_tufte from the
ggthemes package as main theme (which he then edited
manually). But there may be other ways of replicating it.
# Data for main plot: per-actor ideology point estimates and standard errors.
ideology <- read.csv("data/ideology_1.csv")
# Data for background plots: random sample of ideology estimates by group.
bg <- read.csv("data/ideology_2.csv")
#Sources:
#https://ggplot2.tidyverse.org/reference/geom_linerange.html
#https://www.rdocumentation.org/packages/ggplot2/versions/0.9.1/topics/scale_colour_manual
#http://www.cookbook-r.com/Graphs/Plotting_distributions_(ggplot2)/
# Your code here
library(ggthemes)
# Lock the legislator order on the y-axis to the row order of the data file.
ideology$screen_name <- factor(ideology$screen_name, levels = ideology$screen_name)

# Party means used for the annotated reference lines.
av_dem <- mean(ideology$twscore[ideology$party=="Democrat"])
av_rep <- mean(ideology$twscore[ideology$party=="Republican"])

# Background density plot of the ideology sample, one curve per group.
# (data/aes were already set in ggplot(); repeating them in geom_density()
# was redundant and has been dropped.)
d <- ggplot(bg, aes(x = ideology, fill = type)) +
  geom_density(alpha = 0.3, colour = NA) +
  scale_fill_manual(values = c("Republican" = "red", "Democrat" = "blue", "Z" = "black"))

# Main plot: one point + uncertainty interval per legislator.
# Fixed labels use annotate() instead of geom_text(aes(...)) with constants:
# the latter draws one copy of the label PER DATA ROW, overplotting the text.
# geom_vline() likewise no longer receives the two positional NULLs the
# original passed for mapping/data.
ggplot(ideology, aes(x = twscore, y = screen_name, color = party)) +
  geom_linerange(aes(xmax = twscore + twscore.sd,
                     xmin = twscore - twscore.sd)) +
  geom_point() +
  theme_tufte() +
  scale_color_manual(values=c("Republican" = "red", "Democrat" = "blue", "Z" = "black")) +
  xlim(-2,2) +
  geom_vline(xintercept = 0, linetype = "solid", colour = "gray", size = 0.3) +
  annotate("text", size = 2, x = -0.1, y = 8, angle = 90, colour = "black",
           label = "\nAverage Twitter User") +
  geom_vline(xintercept = av_dem, linetype = "solid", colour = "blue",
             size = 0.3, alpha = 0.3) +
  annotate("text", size = 2, x = av_dem - 0.1, y = 8, angle = 90, colour = "black",
           label = "\nAverage Democrat in 114th Congress") +
  geom_vline(xintercept = av_rep, linetype = "solid", colour = "red",
             size = 0.3, alpha = 0.3) +
  annotate("text", size = 2, x = av_rep - 0.1, y = 8, angle = 90, colour = "black",
           label = "\nAverage Republican in 114th Congress") +
  # Semi-transparent name labels nudged left of each point (x/y inherited).
  geom_label(size = 2, aes(label = screen_name), label.size = NA,
             alpha = 0.3, position = position_nudge(y = 0, x = -0.25)) +
  theme(axis.title.y = element_blank(), axis.text.y = element_blank(),
        axis.ticks.y = element_blank(), legend.position = "none",
        plot.title = element_text(size=10, hjust = 0.5, face = "bold")) +
  ggtitle("Twitter ideology scores of potential Democratic and Republican presidential primary candidates") +
  xlab("Position on latent ideological scale")

# Print the background density figure as well.
d
In this exercise you can visualise data about a topic you are interested in.
First download the data. If you are looking for ideas, e.g. have a
look at health data from the World Health Organization,
economic data from the St. Louis
Federal Reserve, or data on wealth and income inequality from the World Inequality Database. Once you
have downloaded the data, load it into R and process it, and then
explore and illustrate it with plots created with ggplot2
and/or plotly. You can also add brief explanations through
markdown text.
More extensive, carefully thought out, polished, and well understandable answers will receive more points.
Note: Some of these data can also be obtained via APIs, but you can just manually download files such as .csv for this assignment. This has no effect on the grade.
Comment on the obtained results:
If we consider the correlation coefficients for the first genre, Electronic, we can observe that the highest ones are recorded between energy and loudness (0.759), valence and danceability (0.392), loudness and acousticness (-0.5), and energy and acousticness (-0.523). A similar observation can be made about other genres as well: the correlation coefficients for these combinations of variables are also mostly the highest and show the same character of relationship. This means this is most likely common for music in general, regardless of genre. Speaking of some genre-specific observations, the correlation coefficients for the genre “Anime” for most variables appear to be higher than for other genres. This could be due to the fact that there is little data available on this genre in the dataset, or simply due to the fact that in case of this genre a lot of variables have a high correlation.
The distributions for each song characteristic in each genre appear to share similarities as well. The distribution of popularity resembles the normal distribution, with classical music probably being the only exception because for this genre the distribution is more similar to bimodal. Danceability and duration mostly resemble the normal distribution as well.
# Your code here
#Sources:
#https://r-graph-gallery.com/199-correlation-matrix-with-ggally.html
#https://stackoverflow.com/questions/40278906/apply-a-function-to-all-the-elements-of-a-data-frame
#https://stackoverflow.com/questions/41577362/suppress-ggpairs-messages-when-generating-plot
#https://ggobi.github.io/ggally/reference/ggpairs.html
#https://stackoverflow.com/questions/62312493/ggallyggpairs-plot-does-not-display-correlation-coefficients
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
music_data <- read.csv("data/music_genre.csv")
genres <- unique(music_data$music_genre)
# Drop the third entry — presumably an empty/invalid genre label in this
# dataset; TODO confirm against unique(music_data$music_genre).
genres <- genres[-3]

# Numeric song characteristics shown in each correlogram.
num_cols <- c("popularity","acousticness","danceability","duration_ms","energy","instrumentalness","liveness","loudness","speechiness","tempo","valence")

correlogram_genre <- function(x) {
  # Takes a music genre as a string and prints a scatterplot matrix
  # of the numeric musical characteristics of that genre.
  dt_genre <- music_data[music_data$music_genre == x, ]
  dt_genre_num <- dt_genre[, num_cols]
  # Coerce every column to numeric; across() replaces the long-deprecated
  # mutate_each(funs(...)) used originally (deprecated since dplyr 0.7/0.8).
  dt_genre_num <- dt_genre_num %>%
    mutate(across(everything(), as.numeric))
  # duration_ms appears to use -1 as a missing-value code in this dataset.
  dt_genre_num$duration_ms[dt_genre_num$duration_ms == -1] <- NA
  dt_genre_num <- na.omit(dt_genre_num)
  # Local is named genre_plot so it no longer shadows base::plot.
  genre_plot <- ggpairs(dt_genre_num,
                        title = paste("Correlogram for the genre", x, sep = " "),
                        progress = FALSE,
                        upper = list(continuous = wrap('cor', size = 2)),
                        lower = list(continuous = wrap("points", alpha = 0.3, size = 0.1),
                                     combo = wrap("dot_no_facet", alpha = 0.3)))
  print(genre_plot)
}

# Draw one correlogram per remaining genre (direct iteration replaces the
# 1:length(genres) index loop).
for (g in genres) {
  correlogram_genre(g)
}
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## Warning: `mutate_each_()` was deprecated in dplyr 0.7.0.
## Please use `across()` instead.
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion